/*******************************************************************************
* Date: November 6, 2018
* Code for creating a wind and pollution dataset for Florida without dropping 
* too many monitors

a) Dropping criteria:  
 This .do file drops wind sites with less than 365 observations. 

Why 365? The wind data is hourly data (and some wind monitors have multiple obs
per hour). The 365 rule asks for just one observation per day.

b) Imputation: 
This .do file fills in missing wind observations. The rules for filling out missing data are the following: 
CARRYFORWARD, then backward; limit to 1 or 3 days out, or no restrictions

Lines 127 - 160: Some checks about which stations we're dropping using different criteria
Lines 179 - 211: Code for circmean
Lines 283 - 380: Code for filling in missing wind observations

Instructions for running this file:
1. To specify maximum distance between EPA pollution monitor and MADIS wind monitor
	change scalar prefdist on line 63
2. To get a dataset for a different year:
	a. change the loop specification on line 78, right after STEP 1
	b. change scalar runSFC_FL to 1 on line 55 (so that a Florida-wide wind direction data set
		is compiled)
	c. change scalar runEPA to 1 on line 56 (so that you get the EPA sites reporting in that
		specific year)	
3. If you get an error saying "dist_ bad: integer variables required", you have the
	wrong version of rowsort; install version SJ9-1: pr0046
4. If you haven't already, install carryforward, nearstat, rowsort, dropmiss
********************************************************************************/

clear all
set more off
set maxvar 32767
ssc install nearstat
ssc install carryforward

global main "B:\SFC data"

scalar runSFC_FL = 1 	// run the code that appends wind direction data for all of Florida for a particular year
scalar runEPA = 1 		// run code that creates EPAlonlat_FL.dta (just lat lon of all EPA sites in FL)

cd "${main}" 

****************
****************
// currently used on line 278
scalar prefdist = 8 // specify max distance between EPA site and wind monitors (any site further than this value is excluded)
****************	
****************




*******************************************************************************
// 					STEP 1: Clean wind direction data 

// Drop monitors with too few observations by any standards (<365 obs)
// When more than one observation within an hour, drop duplicates; if distinct 
// observations, get minmode and maxmode (degrees north)
// Create cardinal wind direction based on min/maxmodes
*******************************************************************************
forval yy = 2007/2012 {

**********************
* First, append SFC data for all of Florida for a certain year
**********************
	if runSFC_FL {
		di "******** `yy' *********"

		local filelist: dir . files "sfc_`yy'*"
		local firstfile "sfc_`yy'0101.dta"
		local others: list filelist - firstfile
		use `firstfile'
		append using `others'

		// keep only wind direction 
		keep if sfcvar=="DD"
		gen varlabel = "Wind direction"
		gen units = "deg north" 
		
		// keep only observations between 8am and 3pm
		keep if hour>=8 & hour<=15
		
		drop datetime
		gen dd = date(time, "YMDhm")
		format dd %td
		gen tt = hms(hour, minute, 00) 
		format tt %tcHH:MM
		gen double datetime = dd*24*60*60*1000 + tt
		format datetime %tcNN/DD/CCYY_HH:MM
		rename (dd tt) (datelocal timelocal)
		
		sort station datetime

		gen season = ""
		replace season = "spring" if month == 2 | month == 3 | month == 4
		replace season = "summer" if month == 5 | month == 6 | month == 7
		replace season = "fall" if month == 8 | month == 9 | month == 10
		replace season = "winter" if month == 11 | month == 12 | month == 1

		cd "${main}"
		save Florida_winddir_`yy', replace
		
	} // if runSFC_FL


	// use data set on Florida wind direction from 8am to 3pm
	use "${main}\Florida_winddir_`yy'", clear
	keep if year == `yy'

	/****************************************************************************
	// Do some graphical analysis to see what stations to drop 
	// Get a distribution of sites by number of observations
	bysort station: egen nobs = count(value)
	preserve
	duplicates drop station nobs, force		// there are 998 stations
	gen b365 = (nobs<=365)
	tab b365 	// 358 stations have at most 365 observations 
	levelsof station if b365==1, loc(stlow) clean
	distplot nobs, frequency 
	restore
	// Conclusion: 154 sites have 1 observation. 287 have less than 10. 
	// 358 sites have less than 365 obs. 
	// 377 sites have less than 1000 observations. 
	
	// Of those that have few observations, inspect the distribution of obs across months
	preserve
	gen keeper = 0
	foreach ss of local stlow {
		replace keeper = 1 if station=="`ss'"
	}
	keep if keeper == 1
	duplicates drop station nobs month, force
	bysort station: egen nmth = count(value)
	hist nmth, frequency
	gen b6 = (nmth<=6)
	duplicates drop station b6, force
	tab b6
	restore
	// Conclusion: 334 stations (out of 358) have at most 6 months represented (by less than 365 observations)
	drop nobs
	// Bottom line, it's better to drop these stations with less than 365 obs since 
	// imputing from so few observations is a far stretch
	****************************************************************************/
	
	levelsof station, loc(stationID) clean
	// Drop stations with too few observations by any standards (less than 365)
	local stationlow "" // contains stations with at most 365 obs
	foreach ss of local stationID {
		qui sum value if station=="`ss'"
		if r(N)<=365 local stationlow "`stationlow' `ss'"
		if r(N)<=365 drop if station=="`ss'"
	}
	
	
	***********************
	***********************
	* ADD CODE FOR CIRCMEAN
	* DES 8/4/17: calculate the circular mean within an hour
	
	*caluclate sine and cosine of value, in radians and their within hour mean
	
	gen sinval = sin(value*(_pi/180))
	bysort station month day hour: egen avsin = mean(sinval) 

	gen cosval = cos(value*(_pi/180))
	bysort station month day hour: egen avcos = mean(cosval) 

	*now take the arctangent of ratio of the within hour average of sin to cos
	
	gen arctan = atan(avsin/avcos)
	
	*finally translate back into degrees and make a correction based on the qudrant
	* of the coordinates;
	
	gen circmean = ((180*arctan)/_pi)
	replace circmean = circmean + 180 if avcos<0
	replace circmean = circmean + 360 if avsin<0 & avcos>0
	
	sum circmean value
	
	*note there are some times when circmean is missing:
	* if in blowing north to south (or east to west) averages out to zero (or approximately zero);  
    * explains differences in values.  
	

	drop sinval cosval avsin avcos arctan
	***********************
	***********************
	
	// Now take care of multiple observations within an hour 
	// drop within hour observations if the reading is the same (duplicates)
	sort station datetime
	duplicates drop station value month day hour, force

	// there are still different readings within an hour; get modes
	bysort station month day hour: egen minmode_rawdir = mode(value), minmode 	
	bysort station month day hour: egen maxmode_rawdir = mode(value), maxmode 
	
	duplicates drop station year month day hour minmode_rawdir maxmode_rawdir circmean, force /* drop 451,695 when either not including circmean or when including circmean */
	
	
	// see http://climate.umn.edu/snow_fence/components/winddirectionanddegreeswithouttable3.htm
	foreach vv in minmode maxmode {
		gen `vv'_dir = . /*This splits wind direction into categories by degree from N*/
		replace `vv'_dir = 1 if `vv'_rawdir >=348.75 &`vv'_rawdir <=360 	/*N*/
		replace `vv'_dir = 1 if `vv'_rawdir >=0      &`vv'_rawdir <=11.25 	/*N*/
		replace `vv'_dir = 2 if `vv'_rawdir >11.25   &`vv'_rawdir <=33.75 	/*NNE*/
		replace `vv'_dir = 3 if `vv'_rawdir >33.75   &`vv'_rawdir <=56.25 	/*NE*/
		replace `vv'_dir = 4 if `vv'_rawdir >56.25   &`vv'_rawdir <=78.75 	/*ENE*/
		replace `vv'_dir = 5 if `vv'_rawdir >78.75   &`vv'_rawdir <=101.25 	/*E*/
		replace `vv'_dir = 6 if `vv'_rawdir >101.25  &`vv'_rawdir <=123.75 	/*ESE*/
		replace `vv'_dir = 7 if `vv'_rawdir >123.75  &`vv'_rawdir <=146.25 	/*SE*/
		replace `vv'_dir = 8 if `vv'_rawdir >146.25  &`vv'_rawdir <=168.75 	/*SSE*/
		replace `vv'_dir = 9 if `vv'_rawdir >168.75  &`vv'_rawdir <=191.25 	/*S*/
		replace `vv'_dir =10 if `vv'_rawdir >191.25  &`vv'_rawdir <=213.75 	/*SSW*/
		replace `vv'_dir =11 if `vv'_rawdir >213.75  &`vv'_rawdir <=236.25 	/*SW*/
		replace `vv'_dir =12 if `vv'_rawdir >236.25  &`vv'_rawdir <=258.75 	/*WSW*/
		replace `vv'_dir =13 if `vv'_rawdir >258.75  &`vv'_rawdir <=281.25 	/*W*/
		replace `vv'_dir =14 if `vv'_rawdir >281.25  &`vv'_rawdir <=303.75 	/*WNW*/
		replace `vv'_dir =15 if `vv'_rawdir >303.75  &`vv'_rawdir <=326.25 	/*NW*/
		replace `vv'_dir =16 if `vv'_rawdir >326.25  &`vv'_rawdir <=348.75 	/*NNW*/
		
		gen cardinal_`vv' = ""
		replace cardinal_`vv' = "N" if `vv'_dir==1
		replace cardinal_`vv' = "NNE" if `vv'_dir ==2
		replace cardinal_`vv' = "NE" if `vv'_dir ==3
		replace cardinal_`vv' = "ENE" if `vv'_dir == 4
		replace cardinal_`vv' = "E" if `vv'_dir == 5
		replace cardinal_`vv' = "ESE" if `vv'_dir == 6
		replace cardinal_`vv' = "SE" if `vv'_dir == 7
		replace cardinal_`vv' = "SSE" if `vv'_dir == 8
		replace cardinal_`vv' = "S" if `vv'_dir == 9
		replace cardinal_`vv' = "SSW" if `vv'_dir == 10
		replace cardinal_`vv' = "SW" if `vv'_dir == 11
		replace cardinal_`vv' = "WSW" if `vv'_dir == 12
		replace cardinal_`vv' = "W" if `vv'_dir == 13
		replace cardinal_`vv' = "WNW" if `vv'_dir == 14
		replace cardinal_`vv' = "NW" if `vv'_dir == 15
		replace cardinal_`vv' = "NNW" if `vv'_dir == 16
	
	}
	
	la var value "Wind direction, degrees north [MADIS SFC]"	
	la var circmean "Wind direction, circular mean within an hour"
	la var minmode_rawdir "Smallest mode of wind direction, degrees north"
	la var maxmode_rawdir "Largest mode of wind direction, degrees north"
	la var minmode_dir "Smallest mode of wind direction, cardinal"
	la var maxmode_dir "Largest mode of wind direction, cardinal"
	la var cardinal_minmode "Smallest mode of wind direction, cardinal string"
	la var cardinal_maxmode "Largest mode of wind direction, cardinal string"
	
	rename value windvalue
	
	order sfcvar varlabel units station datetime circmean windvalue *minmode* *maxmode* lat lon year month day hour minute  
	keep sfcvar varlabel units station datetime *minmode* *maxmode* lat lon year month day hour minute elev windvalue circmean
	
	rename (varlabel units) (windvar windunits)
	la var elev "Wind monitor elevation (meters)"

	****************************************************************************
	/* Fill in data
		1. Carryforward within a station-month-day
		2. If an entire day is missing, populate it using carryforward (so using last full day available)
		Create indicator variables so we know which observations are filled
	*/
	
	// First expand the sample to include every day and hour between 8am-3pm
	gen dd = dofc(datetime)
	gen tt = hms(hour, 00,00)
	drop datetime
	gen double datetime = dd*24*60*60*1000 + tt
	format datetime %tcNN/DD/CCYY_HH:MM
	drop dd tt
	
	// fill in missing hourly observations 
	encode station, gen(stid)
	xtset stid datetime, delta(1 hour)
	tsfill, full
	gen newhh = hh(datetime)
	drop if newhh<8 | newhh>15
	drop newhh
	gen filled = (mi(station)) // indicates all missing observations
	drop station
	decode stid, gen(station)
	drop stid
	
	// now year month day hour are missing, get them again
	drop year month day hour
	gen year = year(dofc(datetime))
	gen month = month(dofc(datetime))
	gen day = day(dofc(datetime))
	gen hour = hh(datetime)
	
	// carryforward within a station-month-day
	foreach vv of varlist windvalue circmean *minmode* *maxmode* {
		gen im_`vv' = `vv'
		la var im_`vv' "Filled in `vv' using carryforward"
	}
	
	foreach vv of varlist windvalue circmean{
		gen im_`vv'_1dayout = im_`vv'
		la var im_`vv'_1dayout "Filled in `vv' using carryforward, 1 day out"
		gen im_`vv'_3dayout = im_`vv'
		la var im_`vv'_3dayout "Filled in `vv' using carryforward, 3 days out"
	}
	
	sort station month day hour
	by station month day: carryforward im_*, replace
	// fill in backwards
	gen negm = -month
	gen negd = -day
	gen negh = -hour
	gsort station negm negd negh
	by station negm negd: carryforward im_*,  replace
	
	// now deal with entire days missing; take closes available day around missing day and fill in
	// simply carryforward (and backward for january 1 missing)
	gen filled_day  = (mi(im_windvalue)) // indicator for entire days missing 
	// there are 64,440 (out of 1,269,330) that are still missing. These missing observations reflect
	// entire days missing 
	// Use carryforward within station-day-hour (can carryforward across months)
	// CREATE indicators for how many days out we're filling in
	sort station hour month day
	gen filled_day_f = mdy(month, day, year) if filled_day==0
	gen filled_day_b = filled_day_f
	gen realday =mdy(month, day, year)
	by station hour: carryforward filled_day_f, replace 
	gsort station negh negm negd 
	by station negh: carryforward filled_day_b,  replace
	format filled_day_f filled_day_b %td
	gen daydiff_f = filled_day_f - realday // negative values indicate that a missing day was filled in with a previous non-missing day
	gen daydiff_b = filled_day_b - realday 
	
	// now carryforward
	sort station hour month day
	by station hour: carryforward im_windvalue im_circmean, replace 
	by station hour: carryforward im_*1dayout if abs(daydiff_f)<=1, replace
	by station hour: carryforward im_*3dayout if abs(daydiff_f)<=3, replace
	
	// fill in backwards 
	// this method gives precedence to filling in forward, i.e. if May 1 and May 5 are not missing, 
	// May 2-4 are filled in from May 1 with 3 days out. With one day out, May 2 is from May 1 and
	// May 4 is filled in with May 5 values.  
	gsort station negh negm negd 
	by station negh: carryforward im_windvalue im_circmean,  replace
	by station negh: carryforward im_*1dayout if abs(daydiff_b)<=1, replace
	by station negh: carryforward im_*3dayout if abs(daydiff_b)<=3, replace

	drop negm negd negh filled_day_* realday daydiff_*
	
	replace sfcvar = "DD" if mi(sfcvar)
	replace windvar = "Wind direction" if mi(windvar)
	replace windunits = "deg north" if mi(windunits)
	 
	la var filled "=1 if expanded wind data, =0 if from raw data"
	la var filled_day "=1 if entire day missing was filled in with carryforward"
	****************************************************************************

	// save hourly wind direction dataset
	tempfile winddir
	save `winddir', replace
	cd "${main}"
	save winddir_lax_`yy', replace		// used for troubleshooting 

		
*******************************************************************************
// STEP 2. Find nearest wind sites to each EPA pollution site
// Find distance between each EPA site and each wind monitor that survived the 
// scrutiny of STEP 1.
// Keep only those wind monitors that are within 3 miles from an EPA site
// Merge EPA data with wind direction data based on nearest wind monitors  
*******************************************************************************

	// EPA site IDs are not unique, i.e. pollutant monitors in two different counties
	// might share the same ID (whyyyyyyy). Pre-step: get lon lat for each EPA site
	// within a county (sitenum-county level id)

	if runEPA {
		use "${main}\hourly_EPApol", clear
		// drop years before 2001 since wind data starts in 2001
		//	gen year = year(dofc(datetime))
		//	drop if year<2001
		// to speed up the code, save observations for yy only
		// first, fix the datetime variable
		gen dtemp = datetime + 2*60*1000	// add 2 minutes since the minutes are either 0 1 or 58 59
		gen dd = dofc(dtemp)
		gen hh = hh(dtemp)
		gen tt = hms(hh, 00, 00)
		format tt %tcHH:MM
		drop datetime
		gen double datetime = dd*24*60*60*1000 + tt
		format datetime %tcNN/DD/CCYY_HH:MM
		gen year = year(dofc(datetime))
		keep if year == `yy'
		
		// keep obs only between 8am and 3pm
		keep if hh >=8 & hh<=15
		
		duplicates drop sitenum county, force
		keep county sitenum latitude longitude
		rename (latitude longitude) (EPAlat_ EPAlon_)
		// get a sitenum-county level id
		tostring sitenum, replace
		gen sitecnty = county + " " + sitenum
		gen id = 1 
		save "${main}\EPAlonlat_FL_`yy'.dta", replace
	} // if runEPA

	use "${main}\EPAlonlat_FL_`yy'.dta", clear
	levelsof sitecnty, loc(epasite)
	rename sitecnty EPAsite
	tempfile epa
	save `epa', replace

	// Get wind site longitude latitude, then merge with epa 
	use `winddir', clear
	levelsof station, loc(stationID) clean
	scalar nwind = wordcount("`stationID'") // count number of stations
	keep station lat lon
	drop if mi(lat)
	drop if station==""
	duplicates drop station, force // some stations (esp on water) change lat-lon combination often, this keeps just one combo for each station
	gen id = _n
	rename (lat lon) (lat_ lon_)
	reshape wide lat_ lon_, i(id) j(station) string
	carryforward *, replace
	keep if id ==_N
	replace id = 1 

	merge 1:m id using `epa', nogen
	drop id
	order EPA*

 di "********************* CALCULATE DISTANCES START *****************************"
	// calculate distance between each EPA site and each wind site
	foreach stid of local stationID {
		nearstat EPAlat_ EPAlon_, near( lat_`stid' lon_`stid') distvar(dist_`stid') r(3958.761) replace
	}
	
 di "********************* CALCULATE DISTANCES END *****************************"
	
	
 di "********************* ORDER BY DISTANCE *****************************"
	
	rowsort dist_*, gen(dist1-dist`=nwind') // sort distances in ascending order
	
 di "********************* IDENTIFY STATIONS WITHIN X MILES*****************************"
	
	// identify closest wind stations within `prefdist' miles of pollution site
	forval vv = 1/`=nwind' {
		replace dist`vv' = . if dist`vv'>prefdist // exclude wind sites that are further than "prefdist" miles, prefdist specified at the very top 
		gen windsite`vv' = ""
		gen windlat`vv' = .
		gen windlon`vv' = .
		foreach ww of local stationID {
			replace windsite`vv' = "`ww'" if abs(dist`vv' - dist_`ww')<0.0001
			replace windlat`vv' = lat_`ww' if abs(dist`vv' - dist_`ww')<0.0001
			replace windlon`vv' = lon_`ww' if abs(dist`vv' - dist_`ww')<0.0001
		}
	}
	
	dropmiss *, force // drops distance variables that have all missing observations
	drop dist_* lat_* lon_*
	rename (EPAlat_ EPAlon_) (EPAlat EPAlon)
 
 di "********************* SAVE TEMPFILE NEAREST*****************************"
	
	tempfile nearest
	save `nearest', replace
	save "${main}\nearest.dta", replace

 di "********************* LOAD POLLUTION DATA *****************************"
	
	use "${main}\hourly_EPApol", clear
	
 di "********************* POLLUTION DATA LOADED *****************************"
	
	rename param pollutant
	// fix the datetime variable (i had coded it up weirdly the first time, just fixing that)
	gen dtemp = datetime + 2*60*1000	// add 2 minutes since the minutes are either 0 1 or 58 59
	gen dd = dofc(dtemp)
	gen hh = hh(dtemp)
	gen tt = hms(hh, 00, 00)
	format tt %tcHH:MM
	drop datetime
	gen double datetime = dd*24*60*60*1000 + tt
	format datetime %tcNN/DD/CCYY_HH:MM
	gen year = year(dofc(datetime))
	
	// keep only year specified above
	keep if year == `yy'
	// keep only between 8am and 3pm
	keep if hh >=8 & hh<=15
	
	drop dtemp dd hh tt
	
	tostring sitenum, replace
	gen sitecnty = county + " " + sitenum
	rename (sitecnty latitude longitude value units) (EPAsite EPAlat EPAlon EPAvalue EPAunits)
	keep EPAsite EPAl* pollutant EPAunits datetime EPAvalue year
	merge m:1 EPAsite using `nearest', nogen
	drop if pollutant=="" // will drop EPA sites that are actually not available during a particular year
	dropmiss *, force
	gen month = month(dofc(datetime))
	gen day = day(dofc(datetime))
	gen hour = hh(datetime)
	
	gen namepol = "PM25" if strmatch(pollutant, "*PM2.5*AQI*")
	replace namepol = "PM25_BC" if strmatch(pollutant, "Black*PM2.5*")
	replace namepol = "PM25_loc" if strmatch(pollutant, "*PM2.5*Local*")
	replace namepol = "PM10" if strmatch(pollutant, "*PM10*Total*")
	replace namepol = "CO" if strmatch(pollutant, "*Carbon*mono*")
	replace namepol = "NO" if strmatch(pollutant, "*Nitric*")
	replace namepol = "NO2" if strmatch(pollutant, "*Nitrogen*di*")
	replace namepol = "NOx" if strmatch(pollutant, "*Oxides*of*")
	replace namepol = "NOy" if strmatch(pollutant, "*Reactive*nitrogen*")
	replace namepol = "SO2" if strmatch(pollutant, "*Sulfur*")
	replace namepol = "Ozone" if pollutant=="Ozone"
	rename pollutant pollutantold
	rename namepol pollutant 
	
	// save a dataset with EPA data and nearest wind sites (<`prefdist' miles) associated with it
	save `nearest', replace
	save "${main}\nearest.dta", replace // use for troubleshooting
	ds windsite*
	scalar nwindsite = wordcount("`r(varlist)'") // max number of windsites within `prefdist' miles

di "********************* STEP 3 *****************************"
	
*******************************************************************************
// STEP 3. Merge with wind direction data
// Since each EPA site has at least one wind monitor associated with it, 
// do the merging in layers (first merge based on the first closest, then 
// second closest etc)
*******************************************************************************
	
	forval vv = 1/`=nwindsite' { // if we're only interested in the closest monitor use vv=1/1, otherwise use `=nwindsite' {
		if `vv' == 1 local ind "1st"
		if `vv' == 2 local ind "2nd"
		if `vv' == 3 local ind "3rd"
		if `vv'>3 local ind "`vv'th"
		
		use `winddir', clear
		preserve
		gen windsite`vv' = station
		merge 1:m windsite`vv' year month day hour using `nearest', nogen
		// fix datetime to have minute be 00 (i did this when filling in in STEP 1 but just in case)
		gen dd = dofc(datetime)
		gen tt = hms(hour, 00,00)
		drop datetime
		gen double datetime = dd*24*60*60*1000 + tt
		format datetime %tcNN/DD/CCYY_HH:MM
		drop dd tt
		
		keep datetime EPAsite pollutant EPAunits EPAvalue EPAl* windsite`vv' *circmean* *windvalue* *mode* dist`vv' windlat`vv' windlon`vv' windvar windunits  minute pollutantold filled filled_day
		order datetime EPAsite pollutant EPAunits EPAvalue windsite`vv' *circmean* *windvalue* *mode* dist`vv' EPAl* windlat`vv' windlon`vv' windvar windunits
		sort EPAsite pollutant datetime
		rename *mode* *mode*`vv'
		rename (*windvalue* *circmean* minute) (*windvalue*`vv' *circmean*`vv' minute`vv')
		
		la var windvalue`vv' "Wind direction, `ind' closest, degrees north [MADIS SFC]"	
		la var im_windvalue`vv' "Filled in, wind direction, `ind' closest, degrees north [MADIS SFC]"
		la var im_windvalue_1dayout`vv' "Filled in 1 day out, wind direction, `ind' closest, degrees north [MADIS SFC]"
		la var im_windvalue_3dayout`vv' "Filled in 3 days out, wind direction, `ind' closest, degrees north [MADIS SFC]"
		la var circmean`vv' "Wind direction, `ind' closest, circular mean"
		la var im_circmean`vv' "Filled in, wind direction, `ind' closest, circular mean"
		la var im_circmean_1dayout`vv' "Filled in 1 day out, wind direction, `ind' closest, circular mean"
		la var im_circmean_3dayout`vv' "Filled in 3 day out, wind direction, `ind' closest, circular mean"
		la var minute`vv' "Minute of reading of windvalue`vv' [MADIS SFC]"
		la var minmode_rawdir`vv' "Smallest mode of wind direction, `ind' closest, degrees north"
		la var im_minmode_rawdir`vv' "Filled in, smallest mode of wind direction, `ind' closest, degrees north"
		la var maxmode_rawdir`vv' "Largest mode of wind direction, `ind' closest, degrees north"
		la var im_maxmode_rawdir`vv' "Filled in, largest mode of wind direction, `ind' closest, degrees north"
		la var minmode_dir`vv' "Smallest mode of wind direction, `ind' closest, cardinal"
		la var im_minmode_dir`vv' "Filled in, smallest mode of wind direction, `ind' closest, cardinal"
		la var maxmode_dir`vv' "Largest mode of wind direction, `ind' closest, cardinal"
		la var im_maxmode_dir`vv' "Filled in, largest mode of wind direction, `ind' closest, cardinal"
		la var cardinal_minmode`vv' "Smallest mode of wind direction, `ind' closest, cardinal string"
		la var im_cardinal_minmode`vv' "Filled in, smallest mode of wind direction, `ind' closest, cardinal string"
		la var cardinal_maxmode`vv' "Largest mode of wind direction, `ind' closest, cardinal string"
		la var im_cardinal_maxmode`vv' "Filled in, largest mode of wind direction, `ind' closest, cardinal string"
		la var windsite`vv' "`ind' closest wind monitor"
		la var dist`vv' "Distance (mi) from `ind' closest wind monitor"
		la var windlat`vv' "Latitude of `ind' closest wind monitor"
		la var windlon`vv' "Longitude of `ind' closest wind monitor"
		
		drop if mi(EPAsite)
		duplicates drop
		duplicates tag  datetime EPAsite pollutant  , gen(dups)
		drop if dups==1 & EPAvalue==0
		sort datetime EPAsite pollutant 
		egen dups1 = seq(), by(datetime EPAsite pollutant) 
		drop if dups1==2
		drop dup*
		tempfile merged`vv'
		save `merged`vv'', replace
		restore
		
		
	}
		
	use `merged1', clear
	duplicates drop  
	duplicates tag  datetime EPAsite pollutant  , gen(dups)
	drop if dups==1 & EPAvalue==0
	sort datetime EPAsite pollutant 
	egen dups1 = seq(), by(datetime EPAsite pollutant) 
	drop if dups1==2
 // uncomment next 3 lines if interested in 2nd, 3rd etc closest monitor
	forval vv = 2/`=nwindsite' {
		merge 1:1 datetime EPAsite pollutant using `merged`vv'', nogen
	}

	sort EPAsite pollutant datetime
	order *mode*, last
	cd "${main}"
	save wind_pollution_nearest_FL_`yy'_`=prefdist'mi_lax, replace
		
}
	


